cleaned data: load packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)

Here I will clean the 2019_subway_rider_data

subway_rider_data = 
  read_csv("data/2019_Subway_Rider_Survey_20251121.csv") |> 
  janitor::clean_names()
## Rows: 10704 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): subway_line_used_most_often, use_of_subway_frequency, get_to_subwa...
## lgl  (1): is_subway_affordable
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
subway_drops =
  subway_rider_data |> 
  select(-survey_stop_borough, -survey_stop_location) |> 
  drop_na() |> 
  mutate(survey_id = row_number()) |> 
  relocate(survey_id, .before = 1)

Now I will specifically clean the data within the columns

Cleaned rider frequency table:

rider_data =
  subway_drops |>
  mutate(
    use_of_subway_frequency = case_when(
      use_of_subway_frequency == "Everyday / Almost everyday" ~ "Every day",
      use_of_subway_frequency == "More than once a week, varies" ~ "Several times a week",
      use_of_subway_frequency %in% c("Weekdays, Mon - Fri", "~5 days per week") ~ "5 days per week",
      use_of_subway_frequency == "Once a week" ~ "Once a week",
      use_of_subway_frequency == "Once a month" ~ "Once a month",
      use_of_subway_frequency == "Almost never" ~ "Almost never",
      use_of_subway_frequency == "Never" ~ "Never",
      TRUE ~ NA_character_
    ),
    use_of_subway_frequency = 
      factor(
        use_of_subway_frequency,
        levels = c("Never", 
                   "Almost never", 
                   "Once a month", 
                   "Once a week",
                   "Several times a week", 
                   "5 days per week", 
                   "Every day"),
        ordered = TRUE)
    )


rider_data =
  subway_drops |> 
  mutate(average_length_subway_ride = 
           factor(average_length_subway_ride,
                  levels = c("< 20 min", 
                             "20 - 40 min", 
                             "40 - 60 min",
                             "60 - 90 min", 
                             "90 min - 2 hours"),
                  ordered = TRUE))


rider_data =
  subway_drops |>
  mutate(
    overall_satisfaction = 
      recode(
        overall_satisfaction,
        "Not satisfied" = "Unsatisfied"),
    overall_satisfaction = 
      factor(
        overall_satisfaction,
        levels = c("Highly unsatisfied",
                 "Unsatisfied",
                 "Neutral",
                 "Somewhat satisfied",
                 "Very satisfied"),
      ordered = TRUE)
    )


rider_data =
  subway_drops |> 
  mutate(
    frequency_of_delays = factor(
      frequency_of_delays,
      levels = c("Everyday",
                 "A few times a week",
                 "Rarely",
                 "Almost never",
                 "Never"),
      ordered = TRUE)
    )

converting complaints into long format:

rider_data =
  rider_data |> 
  mutate(
    top_three_complaints = 
      str_remove_all(top_three_complaints, "\\[|\\]|'")
  ) |> 
  separate_rows(top_three_complaints, sep = ",") |> 
  mutate(top_three_complaints = 
           str_trim(top_three_complaints)) |> 
  rename(complaint = top_three_complaints)

Making some of the variables as a character and some as a factor!

rider_data = 
  rider_data |> 
  mutate(
    is_subway_affordable = if_else(
      is_subway_affordable == "Yes", TRUE, FALSE),
    zip_code = as.character(zip_code)
    )


rider_data =
  rider_data |> 
  mutate(
    across(c(subway_line_used_most_often,
             get_to_subway_via,
             primary_use_of_subway,
             alternative_transport), as.factor)
  )

<<<<<<< HEAD Converting zip codes into boroughs. I used https://www.nyc.gov/assets/planning/download/office/data-maps/nyc-population/census2000/sf1p11.xls to get majority of the borough’s zip codes. Any remaining ones, I looked up on Google. There were three zip codes that I could not find a match for, and were likely miswritten, so I removed those data rows.

zip_raw =
  readxl::read_excel("data/nyc_zipcodes.xls", skip = 4)
## New names:
## • `` -> `...2`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
# Step 2: Rename columns properly
colnames(zip_raw) <- c(
  "borough", "zipcode", "total_population", "white", "black_or_african_american",
  "american_indian_alaska_native", "asian", "native_hawaiian_other_pacific",
  "some_other_race", "two_or_more_races", "hispanic_origin", "total_housing_units"
)

# Step 3: Now select only borough and zipcode
zip_code_data <- zip_raw %>%
  select(borough, zipcode) %>%
  mutate(zipcode = as.character(zipcode))  # ensure character

# Step 4: Make rider_data zip_code a character
rider_data <- rider_data %>%
  mutate(zip_code = as.character(zip_code))

# Step 5: Join
rider_data_full <- rider_data %>%
  left_join(zip_code_data, by = c("zip_code" = "zipcode"))
## Warning in left_join(., zip_code_data, by = c(zip_code = "zipcode")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 127 of `x` matches multiple rows in `y`.
## ℹ Row 201 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Step 6: Fill borough where missing
rider_data_full <- rider_data_full %>%
  mutate(borough = coalesce(borough, as.character(zip_code)))

# Step 7: Remove invalid zip codes
rider_data_full <- rider_data_full %>%
  filter(!zip_code %in% c("12105", "11290", "11135"))

I think that maybe top 3 complaints shouldn’t be separated into individual rows because now it’s double or triple counting all of the other responses in other columns.

should approximate delay duration be an ordered factor?

final data set is rider_data_full

should approximate delay duration be an ordered factor?

anyways final data set is rider_data

Create Graphs

library(dplyr)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

How do riders feel about their delays?

delay_reason =
  rider_data |>
  filter(
    !is.na(most_common_reason_for_delay),
    !is.na(overall_satisfaction)
  ) |>
  count(most_common_reason_for_delay, overall_satisfaction)
delay_reason_plot =
  plot_ly(
    delay_reason,
    x = ~most_common_reason_for_delay,
    y = ~n,
    color = ~overall_satisfaction,
    type = "bar",
    colors = "viridis",
    text = ~paste(
      "Delay reason:", most_common_reason_for_delay,
      "<br>Satisfaction:", overall_satisfaction,
      "<br>Riders:", n
    ),
    hoverinfo = "text"
  ) |>
  layout(
    barmode = "stack",
    title = "Delay Reason by Overall Satisfaction",
    xaxis = list(title = "Most Common Reason to Delay"),
    yaxis = list(title = "Number of Riders")
  )

delay_reason_plot

which trains are the worst with delays?

delay_counts =
  rider_data |>
  filter(frequency_of_delays %in% c("Everyday", "A few times a week")) |>
  count(subway_line_used_most_often, sort = TRUE)
delay_counts_plot =
  plot_ly(
  delay_counts,
  labels = ~subway_line_used_most_often,
  values = ~n,
  type = "pie"
) |>
  layout(
    title = "Subway Lines with the Most Delays"
  )

delay_counts_plot